Load packages and set directories
library(boot)
library(tidyverse)
library(broom)
setwd('~/Library/Mobile Documents/com~apple~CloudDocs/Udveksling/Studie/LS 88 - 3/Project/ls88_project-master')
Load data
df = read_csv('dataframe.csv')
Missing column names filled in: 'X1' [1]Parsed with column specification:
cols(
.default = col_integer(),
season = col_character(),
date = col_datetime(format = ""),
home_team_name = col_character(),
away_team_name = col_character(),
home_poss = col_double(),
away_poss = col_double()
)
See spec(...) for full column specifications.
Retain only non-nominal features
df <- df %>%
select(-X1, -id, -country_id, -league_id, -stage, -date, -match_api_id, -home_team_api_id,
-away_team_api_id, -home_team_name, -away_team_name, -season)
Add Goal Difference as target variable and print distribution
df <- df %>%
mutate(goal_diff = home_team_goal - away_team_goal)
hist(df$goal_diff)
Looks perfectly normaly distributed.
As the target variable is derived from home_team_goal and away_team_goal we drop these variables.
df <- df %>%
select(-home_team_goal, -away_team_goal)
Based on the correlation matrix from the Jupyter Notebook, we will drop variables: home_points away_points
As these 2 variables have too much correlation with the rest of the variables and are interdependent.
df <- df %>%
select(-home_points, -away_points)
We scale the variables without centering to not have certain variables dominate too much
df_scaled <- as.data.frame(scale(df))
Fitting a linear model (iteration 1) We have: Response variable: Goal Difference Explanatory variables: All Others + Intercept
model1 <- lm(goal_diff ~ .,
data = df)
#Print Summary
print(summary(model1))
Call:
lm(formula = goal_diff ~ ., data = df)
Residuals:
Min 1Q Median 3Q Max
-6.7374 -1.0369 -0.0032 0.9892 7.7300
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) -1.0339389 0.7745341 -1.335 0.182004
home_y1 -0.2117143 0.0285360 -7.419 1.52e-13 ***
home_y2 -0.8705695 0.2043061 -4.261 2.10e-05 ***
home_r -1.0733716 0.1484183 -7.232 6.00e-13 ***
away_y1 0.0132646 0.0263142 0.504 0.614239
away_y2 0.5317117 0.1447930 3.672 0.000245 ***
away_r 0.9035694 0.1321295 6.839 9.64e-12 ***
home_fouls 0.0143885 0.0093812 1.534 0.125194
away_fouls -0.0183639 0.0091475 -2.008 0.044783 *
home_poss 0.0507326 0.0077174 6.574 5.76e-11 ***
away_poss -0.0096464 0.0077888 -1.239 0.215624
home_shoton -0.0031848 0.0100736 -0.316 0.751905
away_shoton 0.0001503 0.0119034 0.013 0.989927
home_shotoff -0.0275419 0.0110350 -2.496 0.012618 *
away_shotoff -0.0120150 0.0126634 -0.949 0.342796
home_corners -0.0270724 0.0114862 -2.357 0.018490 *
away_corners 0.0036774 0.0131484 0.280 0.779742
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Residual standard error: 1.663 on 3023 degrees of freedom
Multiple R-squared: 0.141, Adjusted R-squared: 0.1365
F-statistic: 31.02 on 16 and 3023 DF, p-value: < 2.2e-16
Plot Model1
#Plot residuals
plot(model1$residuals)
Residuals look great. (centered around 0 with SD approximate to 1).
The intercept is insignificant. Will be removed. Model 2 Removing intercept.
model2 <- lm(goal_diff ~ . -1,
data = df)
#Plot residuals
plot(model2$residuals)
#Print Summary
print(summary(model2))
Call:
lm(formula = goal_diff ~ . - 1, data = df)
Residuals:
Min 1Q Median 3Q Max
-6.7571 -1.0326 -0.0055 0.9821 7.7238
Coefficients:
Estimate Std. Error t value Pr(>|t|)
home_y1 -0.2112893 0.0285379 -7.404 1.71e-13 ***
home_y2 -0.8668292 0.2043133 -4.243 2.28e-05 ***
home_r -1.0731757 0.1484374 -7.230 6.10e-13 ***
away_y1 0.0124134 0.0263099 0.472 0.637092
away_y2 0.5356928 0.1447810 3.700 0.000219 ***
away_r 0.9033008 0.1321464 6.836 9.84e-12 ***
home_fouls 0.0131246 0.0093345 1.406 0.159820
away_fouls -0.0194850 0.0091101 -2.139 0.032528 *
home_poss 0.0411316 0.0027986 14.697 < 2e-16 ***
away_poss -0.0193323 0.0028321 -6.826 1.05e-11 ***
home_shoton -0.0041805 0.0100472 -0.416 0.677375
away_shoton -0.0006627 0.0118894 -0.056 0.955556
home_shotoff -0.0291513 0.0109704 -2.657 0.007919 **
away_shotoff -0.0142992 0.0125488 -1.139 0.254593
home_corners -0.0280717 0.0114633 -2.449 0.014388 *
away_corners 0.0030107 0.0131407 0.229 0.818795
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Residual standard error: 1.663 on 3024 degrees of freedom
Multiple R-squared: 0.1798, Adjusted R-squared: 0.1754
F-statistic: 41.43 on 16 and 3024 DF, p-value: < 2.2e-16
Notice how home_shotoff is significant with a negative effect on the goal difference (teams who miss a lot of shots at home will achieve a worse goal difference). It is also noticeable how getting a lot of corners on your home turf decreases your goal difference. Surprisingly, shots on goal do not seem to have a significant effect.
| BOOTSTRAPPING |
Non-parametric bootstrapping for the coeffiecents with 5000 models. We do not know the true distribution and use nonparametric bootstrap sampling with replacement accordingly. This will use the empirical distribution.
###Code copied from statmethods.net
#Function to obatin regression weights
bs <- function(formula, data, indices){
d <- data[indices, ] #allows boot to select sample
fit <- lm(formula, data = d)
return(coef(fit))
}
#Creating own boostrap method
bootstrap <- function(data, formula, k){
return.matrix <- matrix(nrow = k, ncol = 16)
set.seed(200)
for(i in 1:k){
sample_temp <- sample_n(data, size = nrow(data), replace = TRUE)
model_temp <- lm(sample_temp,
formula = formula)
return.matrix[i, ] <- coefficients(model_temp)
}
return(return.matrix)
}
#results_own <- bootstrap(df_scaled, goal_diff ~. - 1, 5000)
#Bootstrapping with 5000 replications
results <- boot(df,
statistic = bs,
R = 5000,
formula = goal_diff ~ . - 1)
Plot distributions of parameters
for(i in 1:length(results$t0)){
hist(results$t[,i],
prob = TRUE,
main = colnames(df)[i])
abline(v = coefficients(model2)[i], col = 'red')#
}
coefficients(model2) - apply(results$t, 2, mean)
home_y1 home_y2 home_r away_y1 away_y2 away_r home_fouls
2.340095e-04 1.018827e-03 1.942471e-03 -2.651094e-04 2.172175e-03 -1.336380e-03 1.045057e-04
away_fouls home_poss away_poss home_shoton away_shoton home_shotoff away_shotoff
-1.521606e-04 -2.336063e-05 3.514283e-05 -1.683637e-04 2.079877e-04 1.375954e-04 -2.421452e-04
home_corners away_corners
-2.557286e-05 1.752649e-04
| SUMMARY |
It was found that parameter values were all extremely likely based on the 5000 repetions of nonparametric bootstrapping based on the histograms below, where the red line represents the found parameter value for the model.
for(i in 1:length(results$t0)){
hist(results$t[,i],
prob = TRUE,
main = colnames(df)[i])
abline(v = coefficients(model2)[i], col = 'red')#
}
The fitted model (model 2 without intercept) showed well-behaved residuals and significance on 10/16 parameters
plot(model2$residuals)
summary(model2)
Call:
lm(formula = goal_diff ~ . - 1, data = df)
Residuals:
Min 1Q Median 3Q Max
-6.7571 -1.0326 -0.0055 0.9821 7.7238
Coefficients:
Estimate Std. Error t value Pr(>|t|)
home_y1 -0.2112893 0.0285379 -7.404 1.71e-13 ***
home_y2 -0.8668292 0.2043133 -4.243 2.28e-05 ***
home_r -1.0731757 0.1484374 -7.230 6.10e-13 ***
away_y1 0.0124134 0.0263099 0.472 0.637092
away_y2 0.5356928 0.1447810 3.700 0.000219 ***
away_r 0.9033008 0.1321464 6.836 9.84e-12 ***
home_fouls 0.0131246 0.0093345 1.406 0.159820
away_fouls -0.0194850 0.0091101 -2.139 0.032528 *
home_poss 0.0411316 0.0027986 14.697 < 2e-16 ***
away_poss -0.0193323 0.0028321 -6.826 1.05e-11 ***
home_shoton -0.0041805 0.0100472 -0.416 0.677375
away_shoton -0.0006627 0.0118894 -0.056 0.955556
home_shotoff -0.0291513 0.0109704 -2.657 0.007919 **
away_shotoff -0.0142992 0.0125488 -1.139 0.254593
home_corners -0.0280717 0.0114633 -2.449 0.014388 *
away_corners 0.0030107 0.0131407 0.229 0.818795
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Residual standard error: 1.663 on 3024 degrees of freedom
Multiple R-squared: 0.1798, Adjusted R-squared: 0.1754
F-statistic: 41.43 on 16 and 3024 DF, p-value: < 2.2e-16
The model is of the form goal_difference = beta1x1 + beta2x2 …,
where goal_difference = home_goal - away_goal